引言

这里直接读取作者给定的第一个病人的Gene expression analysis: discovery patient PBMC,用的是 10x genomics 3’ Chromium expression assay

Following sequence alignment and filtering, a total of 12,874 cells were analyzed.

最后是 17,712 genes and 12,874 cells, 所以对计算机的考验很大,而且文章使用的是seurat2,所以这里我们选取并且安装seurat2

载入必要的R包

需要自行下载安装一些必要的R包! 而且需要注意版本 Seurat

因为大量学员在中国大陆,通常不建议大家使用下面的R包安装方法,建议是切换镜像后再下载R包。

参考:http://www.bio-info-trainee.com/3727.html

# 下面代码不运行。
# Enter commands in R (or R studio, if installed)
# Install the devtools package from Hadley Wickham
install.packages('devtools')
# Replace '2.3.0' with your desired version
devtools::install_version(package = 'Seurat', version = package_version('2.3.0'))

library(Seurat)

加载R包

rm(list = ls()) # clear the environment
#load all the necessary libraries
options(warn=-1) # turn off warning message globally
suppressMessages(library(Seurat))

读入文章关于第一个病人的PBMC表达矩阵

start_time <- Sys.time()
# 如果觉得这里较慢,可以使用 data.table 包的 fread函数。
raw_dataPBMC <- read.csv('../Output_2018-03-12/GSE117988_raw.expMatrix_PBMC.csv.gz', header = TRUE, row.names = 1)
end_time <- Sys.time()
end_time - start_time
## Time difference of 1.172894 mins
# 通常电脑一分钟可以搞定。

dim(raw_dataPBMC) 
## [1] 17712 12874
start_time <- Sys.time()
# 按照列,对每一个细胞进行内部归一化,主要是统一文库大小。
dataPBMC <- log2(1 + sweep(raw_dataPBMC, 2, median(colSums(raw_dataPBMC))/colSums(raw_dataPBMC), '*')) # Normalization
end_time <- Sys.time()
end_time - start_time
## Time difference of 30.52906 secs
# 下面是简单的字符串切割,  ExtractField {Seurat}
head(colnames(dataPBMC))
## [1] "AAACCTGAGCGAAGGG.1" "AAACCTGAGGTCATCT.1" "AAACCTGAGTCCTCCT.1"
## [4] "AAACCTGCACCAGCAC.1" "AAACCTGGTAACGTTC.1" "AAACCTGGTAAGGATT.1"
timePoints <- sapply(colnames(dataPBMC), function(x) ExtractField(x, 2, '[.]'))
timePoints <-ifelse(timePoints == '1', 'PBMC_Pre', 
                    ifelse(timePoints == '2', 'PBMC_EarlyD27',
                           ifelse(timePoints == '3', 'PBMC_RespD376', 'PBMC_ARD614')))
table(timePoints)
## timePoints
##   PBMC_ARD614 PBMC_EarlyD27      PBMC_Pre PBMC_RespD376 
##          4516          1592          2082          4684
# 可以看到是治疗前,加上治疗中的3个时间点,这4个分组信息。

表达矩阵的质量控制

简单看看表达矩阵的性质,主要是基因数量,细胞数量;

以及每个细胞表达基因的数量,和每个基因在多少个细胞里面表达。

dim(dataPBMC)
## [1] 17712 12874
# 可以看到,近2万的基因里面,
# 绝大部分基因只在一万多细胞的200个不到的表达
fivenum(apply(dataPBMC,1,function(x) sum(x>0) ))
## RP4-669L17.10         LAMB3         NAT10    AC093673.5         RPL21 
##             1             6            50           207         12102
boxplot(apply(dataPBMC,1,function(x) sum(x>0) ))

# 可以看到,一万多细胞里面
# 绝大部分细胞只能检测不到500个基因。
fivenum(apply(dataPBMC,2,function(x) sum(x>0) ))
## CAAGAAATCGATCCCT.2 GGCCGATTCCAGAGGA.3 TCAACGAAGAGCTGGT.3 
##                 10                321                395 
## TGCCAAAGTCTGCGGT.4 TCTGGAATCTATCGCC.3 
##                481               2865
hist(apply(dataPBMC,2,function(x) sum(x>0) ))

然后创建Seurat的对象

start_time <- Sys.time()
# Create Seurat object
PBMC <- CreateSeuratObject(raw.data = dataPBMC, 
                           min.cells = 1, min.genes = 0, project = '10x_PBMC') # already normalized
dim(dataPBMC)
## [1] 17712 12874
PBMC # 17,712 genes and 12,874 cells
## An object of class seurat in project 10x_PBMC 
##  17712 genes across 12874 samples.
# 可以看到上面创建Seurat对象的那些参数并没有过滤基因或者细胞。
end_time <- Sys.time()
end_time - start_time
## Time difference of 8.503639 secs
# Add meta.data (nUMI and timePoints)
PBMC <- AddMetaData(object = PBMC, 
                    metadata = apply(raw_dataPBMC, 2, sum),
                    col.name = 'nUMI_raw')
PBMC <- AddMetaData(object = PBMC, metadata = timePoints, col.name = 'TimePoints')

一些质控

这里绘图,可以指定分组,前提是这个分组变量存在于meta信息里面,我们创建对象后使用函数添加了 TimePoints 属性,所以可以用来进行可视化。

这里是:‘TimePoints’

sce=PBMC
VlnPlot(object = sce, 
        features.plot = c("nGene", "nUMI"), 
        group.by = 'TimePoints', nCol = 2)

GenePlot(object = sce, gene1 = "nUMI", gene2 = "nGene")

可以看看高表达量基因是哪些

tail(sort(Matrix::rowSums(sce@raw.data)))
##   EEF1A1   RPL13A    RPLP1   TMSB4X      B2M   MALAT1 
## 37203.12 38345.11 38977.99 43027.61 46854.95 62363.12
## 散点图可视化任意两个基因的一些属性(通常是细胞的度量)
# 这里选取两个基因。
tmp=names(sort(Matrix::rowSums(sce@raw.data),decreasing = T))
GenePlot(object = sce, gene1 = tmp[1], gene2 = tmp[2])

# 散点图可视化任意两个细胞的一些属性(通常是基因的度量)
# 这里选取两个细胞
CellPlot(sce,sce@cell.names[3],sce@cell.names[4],do.ident = FALSE)

最后标准聚类可视化

很简单的流程,先ScaleData,再FindVariableGenes,然后根据找到的高变异基因进行RunPCA,再根据PCA结果进行FindClusters即可,最后再RunTSNE后进行可视化。

start_time <- Sys.time()
# 最耗费时间的步骤在这里。
PBMC <- ScaleData(object = PBMC, vars.to.regress = c('nUMI_raw'), model.use = 'linear', use.umi = FALSE)
## NormalizeData has not been run, therefore ScaleData is running on non-normalized values. Recommended workflow is to run NormalizeData first.
## ScaleData is running on non-normalized values. Recommended workflow is to run NormalizeData first.
## 
## Time Elapsed:  53.2460291385651 secs
end_time <- Sys.time()
end_time - start_time
## Time difference of 1.309101 mins
start_time <- Sys.time()
PBMC <- FindVariableGenes(object = PBMC, 
                          mean.function = ExpMean,
                          dispersion.function = LogVMR,
                          x.low.cutoff = 0.0125,
                          x.high.cutoff = 3, 
                          y.cutoff = 0.5)

head(PBMC@var.genes)
## [1] "AP006222.2" "ISG15"      "TNFRSF18"   "TNFRSF4"    "MMP23B"    
## [6] "SMIM1"
length(PBMC@var.genes)
## [1] 1203
PBMC <- RunPCA(object = PBMC, pc.genes = PBMC@var.genes)
## [1] "PC1"
##  [1] "S100A9"        "CST3"          "FTH1"          "LYZ"          
##  [5] "S100A8"        "SAT1"          "AIF1"          "GPX1"         
##  [9] "LST1"          "FCN1"          "CTSS"          "CSTA"         
## [13] "S100A12"       "NEAT1"         "LGALS2"        "VCAN"         
## [17] "RP11-1143G9.4" "TYMP"          "FOS"           "MNDA"         
## [21] "PPBP"          "SERPINA1"      "AP1S2"         "MS4A6A"       
## [25] "MAFB"          "CD14"          "DUSP1"         "NAMPT"        
## [29] "ACTB"          "MT-ND1"       
## [1] ""
##  [1] "NKG7"     "IL32"     "GNLY"     "LTB"      "GZMH"     "KLRB1"   
##  [7] "CTSW"     "ALAS2"    "GZMB"     "FGFBP2"   "HBD"      "AHSP"    
## [13] "HLA-C"    "CMC1"     "CD247"    "TRBC1"    "KLRD1"    "GZMM"    
## [19] "PRF1"     "IL2RG"    "CD3E"     "ACAP1"    "LCK"      "CD2"     
## [25] "CA1"      "TRDC"     "SLC25A39" "ISG20"    "BPGM"     "EVL"     
## [1] ""
## [1] ""
## [1] "PC2"
##  [1] "PF4"       "GNG11"     "SDPR"      "HIST1H2AC" "PPBP"     
##  [6] "TUBB1"     "ACRBP"     "RGS18"     "CLU"       "NRGN"     
## [11] "SPARC"     "GP9"       "MAP3K7CL"  "NGFRAP1"   "TSC22D1"  
## [16] "NCOA4"     "TREML1"    "MMD"       "RUFY1"     "PGRMC1"   
## [21] "PTCRA"     "CLEC1B"    "HIST1H3H"  "C6orf25"   "CMTM5"    
## [26] "ITGA2B"    "TMEM40"    "MYL9"      "TUBA4A"    "TAGLN2"   
## [1] ""
##  [1] "S100A9"        "LYZ"           "S100A8"        "FCN1"         
##  [5] "CTSS"          "AIF1"          "LST1"          "CSTA"         
##  [9] "VCAN"          "S100A12"       "RP11-1143G9.4" "LGALS2"       
## [13] "NEAT1"         "DUSP1"         "FOS"           "TYMP"         
## [17] "SERPINA1"      "MNDA"          "CD14"          "MAFB"         
## [21] "MS4A6A"        "NAMPT"         "CST3"          "SPI1"         
## [25] "MCL1"          "CLEC7A"        "G0S2"          "TKT"          
## [29] "CFP"           "LINC00936"    
## [1] ""
## [1] ""
## [1] "PC3"
##  [1] "NKG7"     "HLA-C"    "GZMB"     "GNLY"     "S100A4"   "FGFBP2"  
##  [7] "GZMH"     "CTSW"     "ACTB"     "PRF1"     "KLRB1"    "KLRD1"   
## [13] "MT-ATP6"  "CD247"    "CMC1"     "IL32"     "FCGR3A"   "MT-CYB"  
## [19] "SPON2"    "KLRF1"    "TRDC"     "GZMM"     "RHOC"     "CCL4"    
## [25] "CLIC3"    "LITAF"    "MT-ND3"   "C12orf75" "TRBC1"    "BIN2"    
## [1] ""
##  [1] "ALAS2"    "AHSP"     "SNCA"     "HBD"      "SLC25A37" "CA1"     
##  [7] "HBM"      "CD79A"    "SELENBP1" "SLC25A39" "BLVRB"    "BPGM"    
## [13] "IGKC"     "IGHD"     "GMPR"     "MPP1"     "IGHM"     "EIF1AY"  
## [19] "TCL1A"    "BNIP3L"   "STRADB"   "CD79B"    "IGLC2"    "MS4A1"   
## [25] "IFIT1B"   "FAM210B"  "YBX3"     "HLA-DQA1" "DCAF12"   "HEMGN"   
## [1] ""
## [1] ""
## [1] "PC4"
##  [1] "ALAS2"    "AHSP"     "HBD"      "SNCA"     "SLC25A37" "CA1"     
##  [7] "HBM"      "BLVRB"    "SLC25A39" "BPGM"     "SELENBP1" "BNIP3L"  
## [13] "MKRN1"    "FAM210B"  "FECH"     "MPP1"     "IFIT1B"   "DCAF12"  
## [19] "GMPR"     "BSG"      "STRADB"   "HEMGN"    "LGALS3"   "SLC4A1"  
## [25] "ADIPOR1"  "PITHD1"   "KRT1"     "BCL2L1"   "EIF1AY"   "NKG7"    
## [1] ""
##  [1] "CD74"       "CD79A"      "HLA-DPB1"   "IGHD"       "IGKC"      
##  [6] "CD79B"      "IGHM"       "TCL1A"      "HLA-DRB1"   "HLA-DQA1"  
## [11] "MS4A1"      "IGLC2"      "MT-CYB"     "MT-ND3"     "MT-ATP6"   
## [16] "LTB"        "VPREB3"     "HLA-DRB5"   "MT-ND1"     "BANK1"     
## [21] "PLPP5"      "FCER2"      "LINC00926"  "IGLC3"      "HLA-DMA"   
## [26] "MEF2C"      "BIRC3"      "HLA-DMB"    "CD69"       "GABPB1-AS1"
## [1] ""
## [1] ""
## [1] "PC5"
##  [1] "FCGR3A"   "GZMB"     "HLA-DPB1" "FGFBP2"   "HLA-DRB1" "CD74"    
##  [7] "SPON2"    "PRF1"     "IFITM3"   "RHOC"     "KLRF1"    "NKG7"    
## [13] "HLA-DQA1" "HBD"      "CD79B"    "AHSP"     "CD79A"    "ALAS2"   
## [19] "HLA-DRB5" "FAM26F"   "SNCA"     "TCL1A"    "IGFBP7"   "CLIC3"   
## [25] "CTSW"     "CA1"      "KLRD1"    "IGHD"     "HBM"      "BPGM"    
## [1] ""
##  [1] "IL7R"     "IL32"     "CD3E"     "GZMK"     "LTB"      "MT-CYB"  
##  [7] "CD8B"     "CD3G"     "MT-ND1"   "CD2"      "LEPROTL1" "MT-ND3"  
## [13] "MT-ATP6"  "DUSP2"    "CD8A"     "LYAR"     "SPOCK2"   "S100A12" 
## [19] "S100A8"   "GSTK1"    "TNFAIP3"  "TRBC1"    "C12orf57" "SLC2A3"  
## [25] "VCAN"     "CD44"     "LCK"      "RORA"     "FYB"      "S100A9"  
## [1] ""
## [1] ""
## 避免太多log日志被打印出来。
PBMC <- FindClusters(object = PBMC, 
                     reduction.type = "pca", 
                     dims.use = 1:10, 
                     resolution = 1, 
                     print.output = 0,
                     k.param = 35, save.SNN = TRUE) # 13 clusters
PBMC <- RunTSNE(object = PBMC, dims.use = 1:10)
# 配色这里直接使用文章配色方案。 
TSNEPlot(PBMC, colors.use = c('green4', 'pink', '#FF7F00', 'orchid', '#99c9fb', 'dodgerblue2', 'grey30', 'yellow', 'grey60', 'grey', 'red', '#FB9A99', 'black'))

end_time <- Sys.time()
end_time - start_time
## Time difference of 1.144884 mins

输出seurat结果后面使用

start_time <- Sys.time()
save(PBMC,file = 'patient1.PBMC.output.Rdata')
end_time <- Sys.time()
end_time - start_time
## Time difference of 1.414166 mins
# 这个步骤会输出文件 1.75G 

最后,这 13 clusters要进行注释,才能发表,如下所示:

作者文章里面是Representative marker genes shown in Supplementary Fig. 7. 如下所示

可以看到作者对PBMC里面的细胞都挑选了一个基因就命名了。

显示运行环境

sessionInfo()
## R version 3.5.1 (2018-07-02)
## Platform: x86_64-apple-darwin15.6.0 (64-bit)
## Running under: macOS  10.14
## 
## Matrix products: default
## BLAS: /Library/Frameworks/R.framework/Versions/3.5/Resources/lib/libRblas.0.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/3.5/Resources/lib/libRlapack.dylib
## 
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   base     
## 
## other attached packages:
## [1] Seurat_2.3.4  Matrix_1.2-14 cowplot_0.9.3 ggplot2_3.2.0
## 
## loaded via a namespace (and not attached):
##   [1] Rtsne_0.15          colorspace_1.4-0    class_7.3-14       
##   [4] modeltools_0.2-22   ggridges_0.5.1      mclust_5.4.1       
##   [7] rprojroot_1.3-2     htmlTable_1.13.1    base64enc_0.1-3    
##  [10] rstudioapi_0.9.0    proxy_0.4-22        npsurv_0.4-0       
##  [13] flexmix_2.3-14      bit64_0.9-7         codetools_0.2-15   
##  [16] splines_3.5.1       R.methodsS3_1.7.1   lsei_1.2-0         
##  [19] robustbase_0.93-3   knitr_1.21          jsonlite_1.6       
##  [22] Formula_1.2-3       ica_1.0-2           cluster_2.0.7-1    
##  [25] kernlab_0.9-27      png_0.1-7           R.oo_1.22.0        
##  [28] compiler_3.5.1      httr_1.3.1          backports_1.1.3    
##  [31] assertthat_0.2.0    lazyeval_0.2.1      lars_1.2           
##  [34] acepack_1.4.1       htmltools_0.3.6     tools_3.5.1        
##  [37] bindrcpp_0.2.2      igraph_1.2.2        gtable_0.2.0       
##  [40] glue_1.3.0          RANN_2.6            reshape2_1.4.3     
##  [43] dplyr_0.7.8         Rcpp_1.0.0          gdata_2.18.0       
##  [46] ape_5.2             nlme_3.1-137        iterators_1.0.10   
##  [49] fpc_2.2-3           gbRd_0.4-11         lmtest_0.9-36      
##  [52] xfun_0.4            stringr_1.3.1       irlba_2.3.2        
##  [55] gtools_3.8.1        DEoptimR_1.0-8      MASS_7.3-50        
##  [58] zoo_1.8-4           scales_1.0.0        doSNOW_1.0.16      
##  [61] parallel_3.5.1      RColorBrewer_1.1-2  yaml_2.2.0         
##  [64] reticulate_1.10     pbapply_1.3-4       gridExtra_2.3      
##  [67] rpart_4.1-13        segmented_0.5-3.0   latticeExtra_0.6-28
##  [70] stringi_1.2.4       foreach_1.4.4       checkmate_1.9.1    
##  [73] caTools_1.17.1.1    bibtex_0.4.2        Rdpack_0.10-1      
##  [76] SDMTools_1.1-221    rlang_0.3.1         pkgconfig_2.0.2    
##  [79] dtw_1.20-1          prabclus_2.2-6      bitops_1.0-6       
##  [82] evaluate_0.12       lattice_0.20-35     ROCR_1.0-7         
##  [85] purrr_0.3.0         bindr_0.1.1         labeling_0.3       
##  [88] htmlwidgets_1.3     bit_1.1-14          tidyselect_0.2.5   
##  [91] plyr_1.8.4          magrittr_1.5        R6_2.3.0           
##  [94] snow_0.4-3          gplots_3.0.1.1      Hmisc_4.2-0        
##  [97] pillar_1.3.1        foreign_0.8-70      withr_2.1.2        
## [100] fitdistrplus_1.0-11 mixtools_1.1.0      survival_2.42-3    
## [103] nnet_7.3-12         tsne_0.1-3          tibble_2.0.1       
## [106] crayon_1.3.4        hdf5r_1.0.1         KernSmooth_2.23-15 
## [109] rmarkdown_1.10      grid_3.5.1          data.table_1.12.0  
## [112] metap_1.0           digest_0.6.18       diptest_0.75-7     
## [115] tidyr_0.8.1         R.utils_2.7.0       stats4_3.5.1       
## [118] munsell_0.5.0